## This script produces Table 1 of the main manuscript, as well as some additional stuff for the SI
## Replication package for Maine/Vermont prison voting paper, September 2020
## Email Ariel White with questions: arwhi@mit.edu

rm(list=ls())
setwd("/home/ariel/Dropbox (MIT)/PrisonVoting/replicationdata") #set local filepath

library(data.table)
library(xtable)
library(openxlsx)

## I'm including the code that read in a series of other publicly-available datasets and merged them together to produce our shared dataset
## but it's commented out because it won't all run unless you also download those datasets from various sources
## (I don't feel right including other people's raw datasets in our replication package without their original documentation)
## Of course, the code that constructs individual variables will run if you want to doublecheck it (all variables are included)
## If you're looking to run all of the commented code and can't find the right version of a given dataset online anymore, feel free to email

#states2018 <- data.table(read.csv("./voterturnout/2018 November General Election - Turnout Rates.csv", header=T, skip=1)) #downloaded from http://www.electproject.org/2018g on 5/29/2019

#states2018[, VEPcount2018 := as.numeric(gsub(",", "", Estimated.or.Actual.2018.Total.Ballots.Counted)) ]
#states2018[, VEPrate2018 := as.numeric(gsub("%", "", Estimated.or.Actual.2018.Total.Ballots.Counted.VEP.Turnout.Rate)) ]
#states2018[, noncitizenrate := as.numeric(gsub("%", "", X..Non.citizen))] #will use this to supplement the BJS data

#states2018 <- states2018[!is.na(VEPrate2018)==T, ] #drop those "notes" rows from the bottom
#states2018 <- states2018[!State.Abv=="DC"&!State.Abv=="VT"&!State.Abv=="ME",] #drop DC (no state prisons) & VT & ME

##and pull in state-level prison/jail estimates from the Vera Institute
#vera <- data.table(read.xlsx("./voterturnout/people-in-prison-in-2018-data.xlsx", sheet=3, startRow=6))
#vera <- vera[!is.na(X2),] #skip empty/summary/regional/notes rows
#vera <- vera[1:51,] #and more notes
#vera <- subset(vera, select=c("X2", "2018")) #keep just states and 2018 incarc. counts
#colnames(vera) <- c("X2", "felonyincarc18") 
#vera[, State := strsplit(X2, " \\(")[[1]][1], by=X2] #strip out parenthetical notes in state names
#vera[State=="U.S. Total", State := "United States"] #make this line up with main dataset

##also pull in ests of noncitizen incarcerated population from BJS
#noncit <- read.csv("./voterturnout/BJSprison/p17t10.csv", header=F, skip=11, stringsAsFactors=F) #read in, skip weird headers
#noncitizen <- data.table(noncit[!is.na(noncit$V2),c("V2", "V6")])
#colnames(noncitizen) <- c("State", "noncitizenpct")
#noncitizen[, State := strsplit(State, "/")[[1]][1], by=State] #strip out notes from state names
#noncitizen[, noncitizenpct := as.numeric(noncitizenpct)/100] 
#noncitizen <- noncitizen[!is.na(State),]; dim(noncitizen) #drop nonsense at the bottom
#noncitizen[State=="U.S. total", State := "United States"] #make this line up with main dataset

##now pull the datasets together
#states2018[, noncitizenpct := noncitizen$noncitizenpct[match(as.character(X), noncitizen$State)]] #pull in noncitizen ests (match by state name)
#states2018[, felonyincarc18 := vera$felonyincarc18[match(as.character(X), vera$State)]] #same thing for the Vera incarc. estimates. 
##fill in the missingness in the BJS noncitizen incarceration data using the ACS estimates for the general population citizenship rates (from M. McDonald data)
## this affects New Hampshire, New Mexico, North Dakota, Rhode Island
#states2018[is.na(noncitizenpct)==T, noncitizenpct := noncitizenrate/100 ]

### now, pull in state-level election results from 2018 (from MEDSL: https://github.com/MEDSL/2018-elections-official/blob/master/state_overall_2018.csv , downloaded 5/29/2019)
### and look at smallest statewide-race vote margin from 2018.
#medsl <- data.table(read.csv("./voterturnout/state_overall_2018.csv", stringsAsFactors=F))
#statewide <- medsl[district=="statewide",]; dim(statewide)
#dim(table(statewide$state)) #not all states but pretty good coverage
#statewide <- statewide[!is.na(candidate) & !(candidate=="Over Votes") & !(candidate=="Under Votes") & !(candidate=="None Of These Candidate") & !(candidate=="Blank Votes")& !(candidate=="Write-Ins") & !(candidate=="All Others"),] #drop non-candidate rows

##next up: want to add up all rows that represent the same candidate (to count all their votes), then figure out the vote margin for each race. 
#statewide[, candidatevotetotal := sum(candidatevotes, na.rm=T), by=c("year", "state", "office", "candidate")] #sum each candidate's votes
#statewide[, candidateobs := 1:.N, by=c("year", "state", "office", "candidate")]
#statewide1 <- statewide[candidateobs==1,]; dim(statewide1) #keep just one row per candidate

#statewide1[grepl("Governor", office)==T | grepl("Attorney General", office)==T | grepl("Auditor", office)==T | grepl("Comptroller", office)==T |grepl("Controller", office)==T  | grepl("Secretary of State", office)==T |grepl("Treasurer", office)==T |grepl("Secretary of the State", office)==T   , officekeep := 1] #keep major statewide offices
##statewide2 <- statewide1[officekeep==1 & is.na(candidatevotes)==F,]; dim(statewide2)
#statewide2 <- statewide1[officekeep==1 ,]; dim(statewide2)
#statewide2[, officekeep:=NULL]

##now also add in senate races.
#medslsen <- data.table(read.csv("./voterturnout/senate_overall_2018.csv", stringsAsFactors=F)) #downloaded 6/5/2019
#medslsen[, candidatevotetotal := sum(candidatevotes, na.rm=T), by=c("year", "state", "office", "candidate")] #sum each candidate's votes
#medslsen[, candidateobs := 1:.N, by=c("year", "state", "office", "candidate")]
#medslsen1 <- medslsen[candidateobs==1,]; dim(medslsen1) #keep just one row per candidate
#statewide3 <- data.table(rbind(statewide2, medslsen1)); nrow(medslsen1)+nrow(statewide2); nrow(statewide3)

#statewide3[, candidatesperrace:= .N, by=c("year", "state", "office")]
##note: some matches for 'governor' are running for board of college governors in MI. but those 28 rows for TN governor seem real-- they just report a lot of independents
#statewide3[, racemargin:= max(.SD$candidatevotetotal) - sort(.SD$candidatevotetotal,TRUE)[2], by= c("year", "state", "office")] #calculate winning margin for each race
#statewidefinal <- unique(statewide3, by= c("year", "state", "office")) #keep just one row for each race
#summary(statewidefinal) #note that there are a couple of NA's-- this is where people ran unopposed (so no #2)

#dim(table(statewidefinal$state)) 
##now sort within state and keep the closest race?
#setkey(statewidefinal, state, racemargin)
#statewidefinal1 <- statewidefinal[!is.na(racemargin),] #drop NAs from uncontested races
#statewidefinal1[, stateorder:= 1:.N, by=state]
#tightestraces <- statewidefinal1[stateorder==1, ]; dim(tightestraces)
#summary(tightestraces$racemargin)
#setkey(tightestraces, "racemargin"); head(tightestraces)

### now stick the vote-margin stuff onto the main states dataset. 
#states2018[, tightestracemargin := tightestraces$racemargin[match(as.character(X), tightestraces$state)]] 

##also, calculate the minimum turnout rate for incarcerated people that would swing that race.
#states2018[, felonyincarc18:= as.numeric(gsub(",", "", Prison)) ] 
#states2018[, felonycitincarc := felonyincarc18*(1-noncitizenpct)] #scale by non-citizen share of state pop.
#states2018[, incarcratetoswing := tightestracemargin/felonycitincarc ]
#summary(states2018$incarcratetoswing)
##at least 11.5%, but sometimes you'd need multiples of the # of people in prison. 

#states2018[, swingrate:= incarcratetoswing*100] #convert this to percentage for table display

##output the merged version of the dataset (this is what's included in the replication file)
#save(states2018, file="stateests_forTable1.Rdata")


# and now load the combined dataset in and make the table
load("stateests_forTable1.Rdata")

colskeep <- c("X", "VEPcount2018", "VEPrate2018", "tightestracemargin", "felonyincarc18",  "swingrate")
tablecolnames <- c("State", "Votes Cast", "\\% Turnout", "Tightest Margin", "Incarcerated", "\\%Turnout to Swing")
statetable <- xtable(subset(states2018, select=colskeep),format.args=list(big.mark=","), include.rownames=F, label="stateests",
caption="State-Level Elections Counterfactual Exercise", digits=c(0,0,0,2,0,0,2))
names(statetable) <- tablecolnames
print(statetable,include.rownames=F, sanitize.colnames.function = identity, tabular.environment="longtable",
      file="statetable_janrevision.tex") #this giant version of the table goes in the SI: Table SI.1 in Section 1.2 of the SI

#keep a version with the states where the counterfactual makes the most sense: this is main Table 1
counterfactstates <- c("Colorado", "DC", "Hawaii", "Illinois", "Indiana", "Maryland", "Massachusetts", "Michigan", "Montana", "Nevada", "New Hampshire", "New Jersey", "North Dakota", "Ohio", "Oregon", "Pennsylvania", "Rhode Island", "Utah")
test <- states2018[as.character(states2018$X) %in% counterfactstates,]
statetable_bestcf <- xtable(subset(states2018[as.character(states2018$X) %in% counterfactstates,], select=colskeep),format.args=list(big.mark=","), include.rownames=F, label="stateests_short",
                          caption="State-Level Elections Counterfactual Exercise", digits=c(0,0,0,0,0,0,0))
names(statetable_bestcf) <- tablecolnames
print(statetable_bestcf,include.rownames=F, sanitize.colnames.function = identity, 
      file="statetable_short_julyrevision.tex")







